Data Challenge: MTA Subway Turnstile Counts

Phase 1: Data Wrangling

For the data challenge I need to scrape the text file data from the mta website. After looking over the website and inspecting a couple of files, I noted some challenges to keep in mind when collecting and organizing the data:

  • there are two different data types - before and after 10/18/14
    • the older files have multiple entries per line, which will need to be separated
    • the older files are also missing a few fields (station, linename, division)
      • this information seems to be available in a separate file, although not entirely complete
    • the older files are formatted using mm-dd-yy while the new files are mm/dd/yy
  • there are 52 files for 2013, which each hold a week's worth of data
  • to make working with the data more manageable, use sqlite with python
    • this will also make it easier to add in other years if I wanted to expand the analysis

Initializing workspace

In [99]:
# Standard library + scraping stack: database, HTTP, HTML parsing, regex, dates.
import sqlite3 as sql
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta
# Analysis and plotting stack.
import pandas as pd
import plotly as py
import plotly.tools as tls
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode()  # enable offline plotly rendering inside the notebook
In [2]:
# Scrape source and database configuration.
mta_url = 'http://web.mta.info/developers/turnstile.html'
db_name = 'TURNSTILES.db'
table_name = 'thirteen'  # table holding the 2013 data
# Column names and matching SQLite types for the turnstile table
# (shared by database_setup and add_entry below).
headers = ["CA", "UNIT", "SCP", "STATION", "LINE", "DIVISION", "DATE", "TIME", "DESC", "ENTRIES", "EXITS"]
types = ["TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "TEXT", "DATE", "TIME", "TEXT", "INTEGER", "INTEGER"]
  • Use Requests & BeautifulSoup to get url data and then parse html content
  • I am going to get all of the links currently present, and from there I can filter for the year I want
    • From inspecting the html I noticed that the files I want all of the pattern "data/nyct/turnstile/turnstile" in the links, so I am going to use regular expressions to get only the links that I want
    • From a ctrl+f I know that there are 301 occurrences of my pattern, so I'm expecting 301 links to be in my dictionary
    • From here, I can either move forward with the full dictionary if I want all the links, or for this purpose I can call another function to create a smaller dictionary of only the links of interest (then I can delete the larger dictionary if I no longer want it)
In [ ]:
# Fetch the MTA developer page and collect every turnstile-data link.
# (Python 2 code: note the print statement.)
page_data = requests.get(mta_url)
pg_dat = page_data.content
content = BeautifulSoup(pg_dat, 'html.parser')  
page_links = content.find_all("a", href=True)  

# Map visible link text (a date string like 'Saturday, January 04, 2014')
# to the absolute download URL, keeping only turnstile data files.
t_links = {}
for link in page_links:  
    if re.match('^data/nyct/turnstile/turnstile.*', link['href']):  
        t_links[link.text] = 'http://web.mta.info/developers/' + link['href']
print len(t_links), 'links in full dictionary'  # expect 301 per the notes above


'''
get links for 2013
'''
def year_links(url_dict, year):
    """Filter a {link text: url} dict down to the links for one year.

    Link text looks like 'Saturday, January 04, 2014', so the year is the
    token after the second comma.

    Args:
        url_dict: dict mapping link text to download URL.
        year: year to keep (int or str), e.g. 2013.

    Returns:
        New dict with only the entries whose link text mentions `year`.
    """
    # FIX: the original immediately re-bound `url_dict = t_links`, which made
    # the parameter dead and tied the function to the global dictionary.
    y_links = {}
    for key in url_dict:
        key_year = key.split(',')[2].split()  # e.g. ' 2014' -> ['2014']
        if str(year) in key_year:
            y_links[key] = url_dict[key]
    return y_links

# Build the dictionary of 2013 links from the full link dictionary.
year = 2013
y_links = year_links(t_links, year)

print len(y_links), 'links in %d dictionary' % year

# the last few days of 2013 are in the first file of 2014
y_links['Saturday, January 04, 2014'] = t_links['Saturday, January 04, 2014']

Set up database and functions to populate database

  • I am going to take advantage of sqlite in python to create a database with a table called 'thirteen' to organize the data as I read it in
  • I know that I need a function to reformat the older files so that they match the newer files and fit into my table structure
    • I need to separate the multiple entries that currently exist on one line of the older files
    • I want to keep the C/A, UNIT, and SCP variables at the beginning of each line, and then repeat those for the other entries
    • I need to add in a null value for the missing variables: station, linename, and division
In [ ]:
'''
set up my database mta.db and create table thirteen
'''


def database_setup(db_name, table_name):
    """Create `table_name` in the SQLite database `db_name` if absent.

    Columns come from the module-level `headers`/`types` lists, preceded by
    an autoincrementing `id` primary key.  Leaves the open connection and
    cursor in the module globals `conn` and `cursor`; the caller is
    responsible for committing and closing.
    """
    global conn, cursor
    columns = ["id INTEGER PRIMARY KEY AUTOINCREMENT"]
    for name, sql_type in zip(headers, types):
        columns.append(name + ' ' + sql_type)
    conn = sql.connect(db_name)
    cursor = conn.cursor()
    cursor.execute('CREATE TABLE IF NOT EXISTS ' + table_name +
                   ' (' + ", ".join(columns) + ')')
    return

# Run setup once, commit the schema, and close; populate_db() opens its
# own connection later.
database_setup(db_name, table_name)
conn.commit()  # `conn`/`cursor` were set as globals by database_setup
conn.close()
In [ ]:
'''
create function to update format of older files - only older files will be passed to this function
'''

def update_format(row, key_df=None):
    """Re-shape one line of a pre-10/18/14 turnstile file into new-format rows.

    Old-format lines are 'C/A,UNIT,SCP' followed by repeating 5-field groups
    of (DATE,TIME,DESC,ENTRIES,EXITS).  Each group becomes its own row, with
    STATION/LINE/DIVISION looked up from the remote-station key (the literal
    string 'NULL' when there is no match; add_entry converts that to a real
    SQL NULL) and the date rewritten from mm-dd-yy to mm/dd/YYYY so it
    matches the newer files.

    Args:
        row: one comma-separated line from an old-format file.
        key_df: optional DataFrame with Remote/Booth/Station/Line/Division
            columns; defaults to the module-level `df` loaded by populate_db().

    Returns:
        List of new-format comma-separated row strings, one per group.
    """
    if key_df is None:
        key_df = df  # global station key loaded in populate_db()

    out_row = []
    len_id = 3     # leading ID fields (C/A, UNIT, SCP) kept for every group
    len_entry = 5  # width of each repeated (date,time,desc,entries,exits) group
    cells = row.split(',')

    id_vars = ",".join(cells[0:len_id])

    # Match booth+remote against the station key with a single combined mask
    # (the original used fragile chained boolean indexing three times).
    booth, remote = cells[0], cells[1]
    match = key_df[(key_df.Remote == remote) & (key_df.Booth == booth)]

    def first_or_null(col):
        # First matching value for `col`, or the 'NULL' placeholder.
        vals = match[col].values
        return vals[0] if len(vals) > 0 else 'NULL'

    station = first_or_null('Station')
    linename = first_or_null('Line')
    division = first_or_null('Division')

    for entry in range(len_id, len(cells), len_entry):

        # make date match format of newer files (mm-dd-yy -> mm/dd/YYYY)
        date = datetime.strftime(datetime.strptime(cells[entry], "%m-%d-%y"), "%m/%d/%Y")

        added_info = ",".join([station, linename, division, date])
        cleaned_row = id_vars + ',' + added_info + ',' + ','.join(cells[entry + 1: entry + len_entry])

        out_row.append(cleaned_row)

    return out_row
In [ ]:
'''
create function to add entry lines from text files and populate the database
'''

def add_entry(input_line, table_name):
    """Insert one comma-separated, new-format line into `table_name`.

    Uses the module-global `cursor` opened by populate_db().  Values are
    stripped of surrounding whitespace and the literal string 'NULL'
    becomes a real SQL NULL.  Lines that do not split into exactly
    len(headers) fields are silently skipped (malformed or blank rows).
    """
    values = input_line.split(',')
    # FIX: strip before comparing so padded placeholders like ' NULL' are
    # also converted to None (the original compared the raw token first).
    values = [None if j.strip() == 'NULL' else j.strip() for j in values]
    values.insert(0, None)  # empty slot for the autoincrement id column
    value_slots = ",".join(['?'] * (len(headers) + 1))
    insert_string = 'INSERT INTO ' + table_name + ' VALUES(' + value_slots + ')'

    # Use len(headers) + 1 instead of the hard-coded 12 so the guard stays
    # consistent with the placeholder count above.
    if len(values) == len(headers) + 1:
        cursor.execute(insert_string, values)
In [ ]:
'''
create function to loop through the links in my dictionary and populate the database
'''


def populate_db(url_dict, db_name, table_name, outfile_name):
    """Download each file in url_dict and load its rows into the database.

    Old-format files are normalized via update_format(); new-format files
    are inserted as-is via add_entry().  Progress is written to
    `outfile_name` and echoed to stdout.  Python 2 code (iterator .next(),
    print statements).  Leaves the global `conn` open for the caller to
    commit.
    """
  
    # read in .csv of station keys for old files
    global df
    df = pd.read_csv('turnstiles_remote_station_key.csv')
    df.columns = ['Remote', 'Booth', 'Station', 'Line', 'Division']

    with open(outfile_name, 'w') as outfile:  # text file to track progress
        
        ctr = 1
        outfile.write('Populating MTA database table turnstiles on' + datetime.now().ctime() + '\n')

        # Connection/cursor are globals so add_entry() and the notebook's
        # later commit can reach them.
        global conn, cursor
        conn = sql.connect(db_name)
        cursor = conn.cursor()


        for key in url_dict:
            url = url_dict[key]

            data = requests.get(url)
            # Peek at the first line: old-format files pack several 5-field
            # entry groups per line, so more than 11 fields means old format.
            # NOTE(review): iter_lines() is called twice; this works because
            # the response content is fully downloaded (not streamed) -
            # confirm if stream=True is ever added.
            header = next(data.iter_lines()).split(",")
            lines = data.iter_lines()
            
            if len(header) > 11:
                # Old format: files have no header row, so process every line.
                for line in lines:
                    entries = update_format(line)  

                    for entry in entries:
                        add_entry(entry, table_name)  

            else:
                # New format: the first line is a column header - skip it.
                lines.next()
                for line in lines:
                    if len(line) > 1:
                        add_entry(line, table_name)
                        
            outfile.write('populated data for file number ' + str(ctr) + ': ' + url_dict[key] + '\n')
            print 'populated data for file number ' + str(ctr) + ': ' + url_dict[key]
            ctr += 1
            
    

Execute database population function to scrape, process, and upload text files

  • Call populate function with dictionary of links, database name, table name, and a file name to track which files were entered
    • The populate_db function will output a text file with a statement that the file was uploaded if any lines are uploaded (or really just didn't throw errors). This doesn't necessarily reflect completeness of upload for that file. With more time would fix this
    • The populate_db does take a non trivial amount of time to execute for a year's worth of data, and so with more time would use trace function to see which steps could be cleaned up to shave off time. This would be particularly important if I wanted to add more years to my dataframe
In [ ]:
# Kick off the full scrape-and-load for 2013 (slow; progress is logged
# to the text file named here).
populate_db(y_links, db_name, table_name, 'downloading_2013_turnstile_data_02062016.txt')
In [ ]:
conn.commit()  # populate_db leaves the global connection open; persist the inserts

Phase 2: Data Analysis

Challenges and issues to keep in mind:

From initial exploration of the data I have noticed a few issues that have the potential to create big problems for the analysis ( Note this is only what I've found so far ):

  1. The entries and exits values represent an "odometer" so when I want to get the value for any given day, I need to do a subtraction.
  2. Not all station names are unique values, and some stations serve multiple divisions
  3. There are a variety of different types of report descriptions - for the sake of simplicity, I am choosing to use only "regular" reports
  4. Unique turnstiles are represented by combinations of Station, Booth, and SCP
    • Ideally, I would have made a primary key for my database that reflected "turnstile id" - since I didn't do this, I'm going to have to group by a number of factors to prevent myself from double counting
  5. There is variation in the timing of turnstile reports
    • Stations generally report over fixed intervals, but the timing varies by station. So one station may start reporting at midnight and then proceed every four hours, while another may start at 2 AM
    • Some stations report more frequently than other stations
  6. There are instances where a turnstile seems to malfunction or "reset" and it's not always clear why this happened
    • For instance, for the 08/01/2013 data, a turnstile in the 42 St Bryant Park station appears to reset between 8:00 and 12:00. It's unclear why this happens, and I don't know when this happened within the time interval
    • In other cases, a turnstile may report a very large negative number, and the meaning of this value is unclear. For the sake of this analysis, I am excluding negative values
In [100]:
# Reopen the database for the analysis phase.
conn = sql.connect(db_name)
cursor = conn.cursor()

1. Identify which station has the most number of units

In [ ]:
'''
query number of turnstiles by station
'''

# Turnstile count per station/division, most turnstiles first.
# NOTE(review): CA is selected but not in the GROUP BY, so SQLite returns
# an arbitrary CA per group; and COUNT(DISTINCT SCP) may undercount since
# SCPs repeat across booths (see note 4 above) - confirm acceptable.
num_stiles_query = "SELECT COUNT(DISTINCT SCP) as turnstiles, STATION, DIVISION, CA FROM thirteen " \
                   "WHERE STATION is not NULL GROUP BY STATION, "\
                    "DIVISION ORDER BY turnstiles DESC ;"

top_units = pd.read_sql_query(num_stiles_query, conn)

top_units[0:10]
In [5]:
# Combine divisions for the two multi-division hub complexes (Times Sq
# includes the Port Authority Bus Terminal per the MTA's station list).
penn_units = top_units['turnstiles'][top_units['STATION']== '34 ST-PENN STA'].sum()
time_sq_units = pd.concat([top_units['turnstiles'][top_units['STATION'] == '42 ST-TIMES SQ'],
                           top_units['turnstiles'][top_units['STATION'] == '42 ST-PA BUS TE']]).sum()

print 'The Times Square Station complex has {0} turnstiles, and the Penn Station complex has {1} turnstiles '.format(
    *[time_sq_units, penn_units])
The Times Square Station complex has 89 turnstiles, and the Penn Station complex has 83 turnstiles 
In [7]:
'''
cleanup
'''
del top_units, penn_units, time_sq_units  # free notebook memory

Discussion of answer and approach

  • Defining station as distinct station name and division, the IND division of Penn Station has the most (52) units.
  • Taking into account that Penn Station and Times Square are both stations that serve multiple divisions, which appear in the top 10 counts of units per station, I considered these stations as complex stations and added the turnstile counts across divisions. Additionally, I combined the Times Sq and Port Authority Bus Terminal into one station given that the MTA website considers this one station complex.
    • The Times Square complex has 89 turnstiles
    • The Penn Station complex has 83 turnstiles
  • 42 St Times Square and the 42 St Port Authority, considered as one station, have the most turnstile units
  • There was some missing station data (not all turnstiles can be matched to a station using the MTA's key)

2. Report the total number of entries and exits across the subway system for 08/01/2013

In [ ]:
'''
Create new table to make future queries a little easier
'''

# Build a `daily` summary table: per turnstile per DATE, the ENTRIES/EXITS
# odometer readings at the day's boundary report times, restricted to
# REGULAR, non-negative readings with a known station.  The UNION stacks
# each day's MAX(TIME) and MIN(TIME) readings; the outer min(end) then
# keeps one boundary row per turnstile-day.
# NOTE(review): the outer GROUP BY omits CA, so turnstiles that share an
# SCP within a station/division collapse to one row - confirm intended.
create_daily_table = '''CREATE TABLE daily as select * from (SELECT min(end), DATE, entryend, exitend, SCP, CA, DIVISION, STATION from

(SELECT MAX(TIME) as end, DATE, ENTRIES as entryend, EXITS as exitend, SCP, CA, DIVISION, STATION from
        (SELECT * FROM thirteen WHERE  DESC = 'REGULAR' AND ENTRIES >= 0 AND
                        EXITS >= 0 AND STATION IS NOT NULL GROUP BY STATION, DIVISION, SCP, CA, DATE, TIME)
         GROUP BY DATE, STATION, DIVISION, CA, SCP
UNION
SELECT MIN(TIME) as start, DATE, ENTRIES as entrystart, EXITS as exitstart, SCP, CA, DIVISION, STATION from
          (SELECT * FROM thirteen WHERE  DESC = 'REGULAR' AND ENTRIES >= 0 AND
                   EXITS >= 0 AND STATION IS NOT NULL GROUP BY STATION, DIVISION, SCP, CA, DATE, TIME)
           GROUP BY DATE, STATION, DIVISION, CA, SCP

ORDER BY STATION, SCP)
GROUP BY DATE, STATION, DIVISION, SCP ORDER BY STATION, SCP)
;'''

cursor.execute(create_daily_table)
In [160]:
'''
Query database and organize into dataframes
'''

# Per-turnstile end-of-day readings for 08/01-08/02/2013; BUSY is the
# combined entry+exit odometer used as the busy-ness measure.
# NOTE(review): DATE is stored as text mm/dd/YYYY, so BETWEEN compares
# strings - it works here because both bounds share the month.
aug1_query = "SELECT STATION, DIVISION, CA, DATE, SCP, entryend as ENTRIES, exitend as EXITS, " \
                 "entryend + exitend as BUSY from daily WHERE entryend >= 0 AND exitend >=0 " \
                 "AND DATE BETWEEN '08/01/2013' AND '08/02/2013' GROUP BY DATE, STATION, DIVISION, CA, SCP " \
                 "ORDER BY STATION, DIVISION,SCP, DATE ;"

        
## get specific turnstile info to get busiest turnstile

aug_df = pd.read_sql_query(aug1_query, conn)


# Categorical dtypes shrink memory for the repeated labels; parse dates.
aug_df['STATION'] = aug_df['STATION'].astype('category')
aug_df['DIVISION'] = aug_df['DIVISION'].astype('category')
aug_df['DATE'] = pd.to_datetime(aug_df['DATE']) 
In [161]:
'''
Get turns/day for entries, exits, and turns & drop problem observations
'''

def calculate_differences(df, cols_to_diff):
    """Add per-turnstile day-over-day difference columns to `df` in place.

    For each column in cols_to_diff, a new '<COL>_DIFF' column holds the
    difference from the previous row within each (STATION, DIVISION, SCP)
    group.  Negative diffs (counter resets) and implausibly large diffs
    (>= 50000, malfunction spikes) become NaN.  DATE is shifted back one
    day because each reading reflects the previous day's activity.

    Returns:
        The same DataFrame (mutated in place) for chaining.
    """
    for col in cols_to_diff:
        new_col = col + '_DIFF'
        df[new_col] = df.groupby(['STATION', 'DIVISION', 'SCP'])[col].diff()

        # FIX: use a real NaN in one combined mask instead of pushing the
        # string 'NaN' through the column and re-parsing with astype(float).
        diffs = df[new_col]
        df[new_col] = diffs.where((diffs >= 0) & (diffs < 50000)).astype('float')
    df['DATE'] = df['DATE'] - timedelta(days=1)
    return df


aug_df = calculate_differences(aug_df, ['ENTRIES', 'EXITS', 'BUSY'])
In [162]:
'''
sum entries and exits across all stations for 08/01/2013
'''
# DATE was shifted back one day inside calculate_differences, so the
# '2013-08-01' rows hold the diffs attributed to August 1st.
total_entries = aug_df[aug_df.DATE == '2013-08-01']['ENTRIES_DIFF'].sum()                    


print 'approximate number of entries on 08/01/2013: ', int(total_entries)

total_exits = aug_df[aug_df.DATE == '2013-08-01'].groupby(['DATE'])['EXITS_DIFF'].sum().values[0]

print 'approximate number of exits on 08/01/2013: ', int(total_exits)

print 'approximate number of total turns on 08/01/2013', int(total_entries + total_exits)
approximate number of entries on 08/01/2013:  5020544
approximate number of exits on 08/01/2013:  3961365
approximate number of total turns on 08/01/2013 8981909

Discussion of approach and answer:

  • I queried 8/1/2013 and 8/2/2013 for each turnstile's entry, exit, and total turn count.
    • I ruled out negative turnstile output
    • I only queried output classified as 'regular'
  • I excluded 'reset' or disrupted turnstile counts
    • if a turnstile's count was negative it was set to NaN
    • I set a somewhat liberal threshold of 50K to identify clearly inaccurate turnstile counts and set those to NaN
    • another approach could have been to replace that turnstile's count with the mean of the other counts at that station
  • The number of entries using this approach was 5,020,544 and the number of exits using this approach was 3,961,365
    • This gives the total amount of activity as 8,981,909
    • To check if this order of magnitude was realistic, I decided to quantify a "rider" as one entry and exit. This means that my approach gives an approximation of 4,490,954 riders in one day, which suggests that the subway system serves 1,639,198,210 riders a year. This is of the order of magnitude of numbers reported by the MTA website.

3. Defining busy-ness as the sum of entry and exit count:

Identify the busiest station on 08/01/2013

In [ ]:
# Top 10 busiest station/division pairs on 08/01/2013 by total turns.
aug_df[aug_df['DATE'] == "2013-08-01"].groupby(['STATION',
                                                  'DIVISION'])['BUSY_DIFF'].sum().sort_values(ascending = False)[0:10]

Defining station as having a distinct name and division, the busiest station is Grand Central Station with a total turnstile count of 176,960 turns. However, I know that Penn Station and Times Square stations serve multiple divisions. So I am going to look at the busy-ness for both of these stations, combining across divisions (and in the case of Times Square, the other division has a different station name).

In [57]:
# Busy-ness per station complex on 08/01, combining divisions.
# NOTE(review): chained df[m1][m2] boolean indexing relies on index
# alignment and warns in newer pandas; consider a single combined mask.
penn_busy = aug_df[aug_df['DATE'] == "2013-08-01"][aug_df['STATION'] == '34 ST-PENN STA']['BUSY_DIFF'].sum()

times_busy = aug_df[aug_df['DATE'] == "2013-08-01"][aug_df['STATION'] == '42 ST-TIMES SQ']['BUSY_DIFF'].sum() 
times_busy += aug_df[aug_df['DATE'] == "2013-08-01"][aug_df['STATION'] == '42 ST-PA BUS TE']['BUSY_DIFF'].sum()

print 'Times Square had {0} total turns and Penn Station had {1} total turns'.format(*[int(times_busy), int(penn_busy)])
Times Square had 310074 total turns and Penn Station had 265265 total turns

If I take into account the fact that these hub stations serve multiple divisions, Times Square is the busiest station with a total of 310,074 turns. Additionally, Penn Station has 265,265 turns, which is greater than the number of turns for Grand Central Station.

Identify the busiest turnstile on 08/01/2013

In [ ]:
# Row with the maximum single-turnstile BUSY_DIFF.
# NOTE(review): max() is taken over the whole frame, not just 08/01 -
# correct only because the global maximum happens to fall on that date.
busy_turnstile = aug_df[aug_df['DATE'] == '2013-08-01'][aug_df['BUSY_DIFF'] == aug_df['BUSY_DIFF'].max()]

t_vals_print = [busy_turnstile['CA'].values[0], busy_turnstile['SCP'].values[0],busy_turnstile['STATION'].values[0], 
                busy_turnstile['BUSY_DIFF'].values[0]]
print 'the busiest turnstile was {0} {1} in station {2} with {3} turns'.format(*t_vals_print) 

The busiest turnstile on 08/01/2013 was N063a 00-00-00 in 42 ST Port Authority with 11,845 turns.

In [59]:
'''
cleanup
'''
del aug_df  # free memory before the year-long analysis

4. Identify the stations that have seen the most growth and decline in usage in 2013

Conceptualizing Growth and Decline:

  • Given that I have an 'odometer' type reading of turns (busy-ness), I should expect that the turns should increase each day, except when I have a reset or some other disruption. I decided to quantify turns per day, and then look for changes in turns per day. I define growth as an average positive change in turns per day, and decline as an average negative change in turns per day.
  • To quantify turns per day, I used the diff() function in pandas to subtract the previous day's turn count from the current day, for each station.
    • This method could result in negative or disproportionately large difference in the case of disruption/reset.
    • I dropped these instances by setting all negative first order differences to NaN.
    • Also, I know that the busiest stations see turns per day in the hundreds of thousands range, and so any count of turns per day significantly higher than this is likely noise. I set a liberal threshold of 800K for turns per day, and any counts exceeding this threshold were dropped (set to NaN).
  • To quantify change in turns per day, I used the diff() function again to quantify the fluctuations in rates of turns per day. I then took the average of this quantity for each station.
    • If a station experiences a constant rate of turns per month, this quantity should be about zero
    • if the rate of turns per month goes down, signaling decline in usage, this quantity will be negative
    • If the rate of turns per month increases, signaling growth in usage, this quantity will be positive
In [101]:
'''
query, pull in data frame, format categories
'''
# Station-level daily totals for the whole year, summing turnstile
# boundary readings per STATION/DIVISION/DATE.
all_days_query = "SELECT STATION, DIVISION, DATE, sum(entryend) as ENTRIES, sum(exitend) as EXITS, " \
                 "sum(entryend) + sum(exitend) as BUSY from daily GROUP BY DATE, " \
                 "STATION, DIVISION ORDER BY STATION, DIVISION ;"

days_df = pd.read_sql_query(all_days_query, conn)

# Categoricals for the repeated labels; parse DATE text into datetimes.
days_df['STATION'] = days_df['STATION'].astype('category')
days_df['DIVISION'] = days_df['DIVISION'].astype('category')
days_df['DATE'] = pd.to_datetime(days_df['DATE']) 
In [102]:
'''
take difference between days of 'busy' to get turns per day - grouped by station/division
'''
def calculate_differences_noSCP(df, cols_to_diff):
    """Add station-level day-over-day difference columns to `df` in place.

    Like calculate_differences() but grouped by (STATION, DIVISION) only,
    with a higher implausibility threshold (>= 800000 turns/day) because
    whole-station totals are larger than single-turnstile ones.  Negative
    diffs become NaN; DATE is shifted back one day to label each diff with
    the day it reflects.

    Returns:
        The same DataFrame (mutated in place) for chaining.
    """
    for col in cols_to_diff:
        new_col = col + '_DIFF'
        df[new_col] = df.groupby(['STATION', 'DIVISION'])[col].diff()

        # FIX: one combined mask with a real NaN instead of routing the
        # string 'NaN' through the column and re-parsing via astype(float).
        diffs = df[new_col]
        df[new_col] = diffs.where((diffs >= 0) & (diffs < 800000)).astype('float')
    df['DATE'] = df['DATE'] - timedelta(days=1)
    return df

# Restrict to calendar-year 2013, then compute the *_DIFF columns.
# NOTE(review): df[m1][m2] chained boolean indexing warns in newer pandas.
days_df = days_df[days_df.DATE <= '2013-12-31'][days_df.DATE >= '2013-01-01'].reset_index()
days_df = calculate_differences_noSCP(days_df, ['ENTRIES', 'EXITS', 'BUSY'])
In [ ]:
'''
sanity check: are my busiest stations reporting the highest number of turns per day?
'''
avg_diff = days_df.groupby(['STATION', 'DIVISION'])['BUSY_DIFF'].mean()
print avg_diff.sort_values(ascending = False)[0:10]

'''
get change in turns per day, and average change - grouped by station/division
'''

# Second difference: day-over-day change in turns-per-day; its per-station
# mean is the growth/decline measure described in the notes above.
days_df['BUSY_DIFF2'] = days_df.groupby(['STATION', 'DIVISION'])['BUSY_DIFF'].diff()

avg_2nd_diff = days_df.groupby(['STATION', 'DIVISION'])['BUSY_DIFF2'].mean()

# which stations are declining?
print 'declining' 
print avg_2nd_diff.sort_values()[0:10]

#growing?
print 'growing'  
print avg_2nd_diff.sort_values(ascending = False)[0:10]

Results:

Based on my conceptualization of growth and decline within 2013 as average change in the rate of turns per day:

  • These are the top five stations that showed growth within 2013:
    1. Beach St 98
    2. Ft Hamilton Parkway
    3. Main St
    4. 42 St Grand Central
    5. Canal St
  • These are the top five stations that showed decline within 2013:
    1. Path WTC
    2. Grove St
    3. JFK Howard Beach
    4. Newark BM BW
    5. Exchange Place

Discussion of results and approach:

  • To look at growth and declines in usage over the course of 2013, I looked at the average rate of change of turns per day for each station. This gives me an idea of trends within the year, but this isn't the same as directly comparing average number of riders per station for 2012 and 2013. Therefore the changes I am seeing could reflect seasonal trends, construction projects, etc.

    • I could follow up this analysis with directly comparing 2013 to 2012, to see if the changes I detected were reflective of recurring seasonal trends or indicative of lasting usage growth/decline
  • Most of the stations showing decline were in the PATH division. JFK Howard Beach and Sutphin Blvd were the only stations in the top ten that weren't in the PATH divisions

5. Identify the dates within 2013 that were least busy

In [ ]:
# Total turns per day across all stations, then z-scores to rank days.
av_turns_all_station = days_df.groupby('DATE')['BUSY_DIFF'].sum()


# get average turns/day overall
av_turns = av_turns_all_station.mean()

# average turns per day 7,966,954
# about 8 million turns per day

z_turns = (av_turns_all_station - av_turns_all_station.mean())/(av_turns_all_station.std())
z_turns.sort_values()[0:10]  # most negative z-scores = least busy days

Across the Subway system, the 5 least busy days were:

  • 12/25/2013 (Christmas Day)
  • 1/13/2013
  • 11/28/2013 (Thanksgiving Day)
  • 2/09/2013
  • 1/06/2013

Could you identify on which dates were stations closed or not operating at full capacity?

  • By comparing each station to its own average daily turn count, I could identify dates that the station is not operating at full capacity. Specifically, I would define low traffic days as those falling more than 2 standard deviations from the mean
In [164]:
'''
compare each station to its own daily average
'''
# Per-station z-score of daily turns (zscore is reused for the Q3 plots).
zscore = lambda x: (x - x.mean()) / x.std()
days_df['DAILYZ'] = days_df.groupby(['STATION', 'DIVISION'])['BUSY_DIFF'].transform(zscore)


'''
A station is experiencing low traffic if the number of turns is more than 2 standard deviations below the mean
'''
low_traffic = days_df[days_df['DAILYZ'] < -2]
  • A station is closed if there is no increase in the turn count from the day before
  • To get an idea if this closure is a somewhat isolated incident rather than pervasive (indicative of long term construction for instance), I would examine the z-score of the date for the specific station. If the z-score is farther below zero, I know that this closure is not particularly representative of normal activity. If the z-score is closer to zero, this tells me that there were multiple closures that affected the overall mean, and therefore this station likely experiences more closures.
In [ ]:
'''
a station will be closed if the difference in busy-ness between days is 0
'''
closed = days_df[days_df['BUSY_DIFF'] == 0]
'''
To get an 
'''

closed.sort_values(by = ['DAILYZ','DATE'])[0:10]

Phase 3: Data Visualization

1. Plot the daily row counts for files in Q3 2013

In [105]:
'''
Query table thirteen to get a count of all observations (including irregular that were previously ignored)
'''

# Daily observation counts straight from the raw table (all DESC types).
# NOTE(review): text-date BETWEEN works here only because both bounds
# fall in the same year with zero-padded mm/dd/YYYY values.
row_count_query = "SELECT DATE, COUNT(DISTINCT id) as rows FROM thirteen WHERE " \
                    "DATE BETWEEN '07/01/2013' AND '09/30/2013' GROUP BY DATE ;"
    
row_count_df = pd.read_sql_query(row_count_query, conn)
row_count_df['DATE'] = pd.to_datetime(row_count_df['DATE']) 
In [123]:
# Line plot of observations per day in Q3 2013.
traces_row = []
traces_row.append(Scatter(x = row_count_df.DATE, y = row_count_df.rows, name='Number of Observations',
                          
                  line = dict(color= 'rgb(106,81,163)', width = '4') ))

iplot({"data": traces_row,"layout": { 'title':"Number of Turnstile Observations Per Day in Q3 2013",
                                            'xaxis': {'title': 'Date'},
                                            'yaxis': {'title': 'Number of Observations'}}})

Note I conceptualized row count as number of observations per day. Because I separated out observations from the rows within each file, this number is larger than the actual "row" count, but by a constant value. Thus the trend should not be compromised.

2. Plot the total number of entries and exits across the system for Q3 2013

In [108]:
# Q3 subset, then system-wide entry/exit totals per day.
q3_df = days_df[days_df.DATE >= '2013-07-01'][days_df.DATE < '2013-10-01']

# del days_df

turns_day = q3_df.groupby('DATE')['ENTRIES_DIFF', 'EXITS_DIFF'].sum()
turns_day.index.name = 'date'
turns_day = turns_day.reset_index()
turns_day.columns = ['Date', 'Entries', 'Exits']
In [109]:
# Stacked bars: daily entries and exits across the system for Q3.
traces = []
traces.append(Bar(x = turns_day.Date, y = turns_day.Entries, name='Entries', 
                  marker = dict(color= 'rgb(117,107,177)')))
traces.append(Bar(x = turns_day.Date, y = turns_day.Exits, name ='Exits', 
                  marker=dict(color='rgb(188,189,220)')))


iplot({"data": traces,"layout": {'barmode':'stack', 
                                           'title':"Daily Entries and Exits for Q3 2013",
                                            'xaxis': {'title': 'Date'},
                                            'yaxis': {'title': 'Total Turns from Entries and Exits'}}})

Note, the entries and exits are stacked to show the total turns per day. Interestingly, it seems that there are fewer exits than entries per day. Perhaps people are making use of the emergency exit doors?

3. Plot the mean and standard deviation of the daily total number of entries & exits for each month in Q3 2013 for station 34 ST-PENN Station

In [188]:
penn_df = q3_df[q3_df['STATION'] == '34 ST-PENN STA']  # Penn Station subset of Q3
In [ ]:
# Label each row with 'MonthName-Year' for monthly grouping.
# NOTE(review): penn_df is a slice of q3_df, so this assignment may raise
# a SettingWithCopyWarning; consider .copy() when subsetting.
penn_df['MONTH'] = penn_df['DATE'].apply(lambda x:datetime.strftime(x,'%B-%Y')) 
In [193]:
# Monthly mean and standard deviation of daily entries at Penn Station.
penn_entry_av = penn_df.groupby('MONTH')['ENTRIES_DIFF'].mean().round(2)
penn_entry_std = penn_df.groupby('MONTH')['ENTRIES_DIFF'].std()

penn_entry_av.index.name = 'month'
penn_entry_av = penn_entry_av.reset_index()
penn_entry_av.columns = ['Month', 'Average']
In [146]:
# Monthly mean and standard deviation of daily exits at Penn Station.
penn_exit_av = penn_df.groupby('MONTH')['EXITS_DIFF'].mean().round(2)
penn_exit_std = penn_df.groupby('MONTH')['EXITS_DIFF'].std()
penn_exit_av.index.name = 'month'
penn_exit_av = penn_exit_av.reset_index()
penn_exit_av.columns = ['Month', 'Average']
In [147]:
# Grouped bars: monthly average daily entries/exits with std-dev error bars.
# NOTE(review): groupby('MONTH') sorts the string labels alphabetically
# ('August-2013', 'July-2013', 'September-2013'), while month_names is in
# calendar order - the bars may be paired with the wrong month; verify.
month_names = ['July', 'August', 'September']
traces2 = []
traces2.append(Bar(x = month_names, y = penn_entry_av.Average, name = 'Avg Entries',
                   error_y = dict(type = 'data', array = penn_entry_std, visible = True), 
                   marker = dict(color= 'rgb(117,107,177)')))
traces2.append(Bar(x = month_names, y = penn_exit_av.Average, name = 'Avg Exits',
                   error_y = dict(type = 'data', array = penn_exit_std, visible = True),
                  marker=dict(color='rgb(188,189,220)')))


layout = Layout(
        title = 'Average of Daily Turns at 34 St Penn Station in Q3 2013',
        xaxis = dict(title ='Month'),
        yaxis= dict (title ='Average Daily Turns'))

py.offline.iplot({
    'data' : traces2,
    'layout': layout})

4. Plot 25/50/75 percentile of the daily total number of entries & exits for each month in Q3 2013 for station 34 ST-PENN Station

In [150]:
# Grouped box plots: monthly 25/50/75 percentiles of daily entries/exits.
traces3 = []
traces3.append(Box(x = penn_df['MONTH'], y = penn_df['ENTRIES_DIFF'], name = 'Entries',
                   marker = dict(color= 'rgb(117,107,177)')))
traces3.append(Box(x = penn_df['MONTH'], y = penn_df['EXITS_DIFF'], name = 'Exits',
                   marker=dict(color='rgb(188,189,220)')))


layout = Layout(
        title = 'Monthly Percentiles of Daily Turns at 34 St Penn Station in Q3 2013',
        xaxis = dict(
                    title = 'Month'),
        yaxis= dict (title ='Average Daily Turns'),
        boxmode = 'group')

py.offline.iplot({
    'data' : traces3,
    'layout': layout})

5. Plot the daily number of closed stations and number of stations that were not operating at full capacity in Q3 2013

In [151]:
q3_df['DAILYZ'] = q3_df.groupby(['STATION', 'DIVISION'])['BUSY_DIFF'].transform(zscore)


'''
A station is experiencing low traffic if the number of turns is more than 1 standard deviations below the mean
'''

q3_df['LOW'] = q3_df['DAILYZ'] < -2
low_traffic = q3_df.groupby(['DATE'])['LOW'].sum()

low_traffic.index.name = 'DATE'
low_traffic = low_traffic.reset_index()
low_traffic.columns = ['DATE', 'LOW']
In [152]:
'''
a station will be closed if the difference in busy-ness between days is 0
'''
# Count closed stations per day (zero net turns since the prior day).
q3_df['CLOSED'] = q3_df['BUSY_DIFF'] == 0
closed = q3_df.groupby(['DATE'])['CLOSED'].sum()

closed.index.name = 'DATE'
closed = closed.reset_index()
closed.columns = ['DATE', 'CLOSED']
In [153]:
# Grouped bars: daily counts of below-capacity vs closed stations in Q3.
traces4 = []
traces4.append(Bar(x = low_traffic.DATE, y = low_traffic.LOW, name='Below Capacity', 
                  marker = dict(color= 'rgb(117,112,179)')))
traces4.append(Bar(x = closed.DATE, y = closed.CLOSED, name ='Closed', 
                  marker=dict(color='rgb(217,95,2)')))


py.offline.iplot({"data": traces4,"layout": {'barmode':'group', 
                                           'title':"Stations Operating Below Capacity or Closed During Q3 2013",
                                            'xaxis': {'title': 'Date'},
                                            'yaxis': {'title': 'Number of Stations'}}})

To be continued...

In [ ]: